############################### Ontario House Prediction Analysis ##########################################
# This project explores predictive analysis for housing-related factors in Ontario
# Applying machine learning techniques.
# The goal is to provide insights into property prices, aiding stakeholders in making informed investment decisions.
# The analysis includes preprocessing data, applying predictive models, and assessing potential risks and opportunities in various Ontario areas.
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
# Load the dataset
housing = pd.read_csv("ontario_properties.csv")
housing.shape
(25351, 6)
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25351 entries, 0 to 25350 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 25351 non-null int64 1 Address 25351 non-null object 2 AreaName 24868 non-null object 3 Price ($) 25351 non-null int64 4 lat 25351 non-null float64 5 lng 25351 non-null float64 dtypes: float64(2), int64(2), object(2) memory usage: 1.2+ MB
# Display the first few rows of the dataset
housing.head()
| Unnamed: 0 | Address | AreaName | Price ($) | lat | lng | |
|---|---|---|---|---|---|---|
| 0 | 0 | 86 Waterford Dr Toronto, ON | Richview | 999888 | 43.679882 | -79.544266 |
| 1 | 1 | #80 - 100 BEDDOE DR Hamilton, ON | Chedoke Park B | 399900 | 43.250000 | -79.904396 |
| 2 | 2 | 213 Bowman Street Hamilton, ON | Ainslie Wood East | 479000 | 43.251690 | -79.919357 |
| 3 | 3 | 102 NEIL Avenue Hamilton, ON | Greenford | 285900 | 43.227161 | -79.767403 |
| 4 | 6 | #1409 - 230 King St Toronto, ON | Downtown | 362000 | 43.651478 | -79.368118 |
housing.drop(housing.columns[0],axis=1,inplace=True)
housing.columns
Index(['Address', 'AreaName', 'Price ($)', 'lat', 'lng'], dtype='object')
housing.describe()
| Price ($) | lat | lng | |
|---|---|---|---|
| count | 2.535100e+04 | 25351.000000 | 25351.000000 |
| mean | 5.645438e+05 | 37.326614 | -85.218379 |
| std | 8.475962e+05 | 82.858347 | 73.093572 |
| min | 0.000000e+00 | -999.000000 | -999.000000 |
| 25% | 2.199000e+05 | 43.401087 | -80.308159 |
| 50% | 3.719000e+05 | 43.715977 | -79.503342 |
| 75% | 6.199000e+05 | 44.466711 | -79.107326 |
| max | 3.250000e+07 | 53.851017 | 1.074519 |
# Check for missing values
print(housing.isnull().sum())
Address 0 AreaName 483 Price ($) 0 lat 0 lng 0 dtype: int64
housing = housing.dropna(subset=['AreaName'])
housing
| Address | AreaName | Price ($) | lat | lng | |
|---|---|---|---|---|---|
| 0 | 86 Waterford Dr Toronto, ON | Richview | 999888 | 43.679882 | -79.544266 |
| 1 | #80 - 100 BEDDOE DR Hamilton, ON | Chedoke Park B | 399900 | 43.250000 | -79.904396 |
| 2 | 213 Bowman Street Hamilton, ON | Ainslie Wood East | 479000 | 43.251690 | -79.919357 |
| 3 | 102 NEIL Avenue Hamilton, ON | Greenford | 285900 | 43.227161 | -79.767403 |
| 4 | #1409 - 230 King St Toronto, ON | Downtown | 362000 | 43.651478 | -79.368118 |
| ... | ... | ... | ... | ... | ... |
| 25346 | 3100 CARLING AVENUE UNIT#416 Ottawa, ON | Bayshore | 154900 | 45.353519 | -75.807793 |
| 25347 | 5827 GLADEWOODS PLACE Ottawa, ON | Orléans | 624900 | 45.441273 | -75.532745 |
| 25348 | 6349 DEERMEADOW DRIVE Ottawa, ON | Greely | 899000 | 45.238155 | -75.602249 |
| 25349 | 212 ALVIN ROAD Ottawa, ON | Rockcliffe - Manor Park | 295000 | 45.453838 | -75.650040 |
| 25350 | BARTON Street BARTON Street Stoney Creek, ON | Stoney Creek | 1595000 | 43.220070 | -79.689835 |
24868 rows × 5 columns
# Check available columns
housing.columns
Index(['Address', 'AreaName', 'Price ($)', 'lat', 'lng'], dtype='object')
### counts of properties in each area ############
counts = housing['AreaName'].value_counts()
counts
Downtown 830
Mississauga 752
Waterloo 459
Niagara Falls 355
Richmond Hill 335
...
Cavan 1
Thorah Island 1
Beechborough - Greenbrook 1
Broadview North 1
Appleby 1
Name: AreaName, Length: 1119, dtype: int64
# Create a bar chart
plt=counts[0:20].plot(kind='bar', color='purple',title="Top 20 Areas by Number of house sales")
#Group the data by "AreaName" and calculate the average price
average_prices = housing.groupby('AreaName')['Price ($)'].mean()
average_prices
AreaName
Aberfoyle 6.800000e+04
Acton 3.793000e+05
Agassiz 7.400000e+04
Agincourt 4.252873e+05
Agincourt North 2.200000e+06
...
York Mills 4.436242e+06
York University Heights 4.832300e+05
Yorkdale 1.208333e+05
Zephyr 1.142386e+06
Zurich 2.279000e+05
Name: Price ($), Length: 1119, dtype: float64
# Top 20 areas with highest average prices
top_n_areas = average_prices.nlargest(20)
# Bottom 20 areas with lowest average prices
bottom_n_areas = average_prices.nsmallest(20)
# Print or use these top and bottom areas as needed
print("Top 20 Areas:")
print(top_n_areas)
print("\nBottom 20 Areas:")
print(bottom_n_areas)
Top 20 Areas: AreaName Bridle Path 1.518329e+07 Appleby 9.000000e+06 Lytton Park 4.499900e+06 York Mills 4.436242e+06 Tempo 4.198333e+06 Edenbridge - Humber Valley 3.627822e+06 St. Andrew - Windfields 3.393979e+06 Armour Heights 3.270000e+06 Pelee Island 3.200000e+06 Crerar 3.199818e+06 Clarington 3.012860e+06 East Bayfront 2.785000e+06 Chestnut Hills 2.749900e+06 Windfields 2.707292e+06 Florida 2.700000e+06 Caribou Park 2.680000e+06 Pioneer Tower East 2.549950e+06 Thorncrest Village 2.537500e+06 West Queen West 2.396750e+06 Rural Oshawa 2.339980e+06 Name: Price ($), dtype: float64 Bottom 20 Areas: AreaName North End West 0.000000 Port Lands 0.000000 Bishop Hellmuth Heritage District 1883.333333 Trillium Industrial Park 3800.000000 Manitowaning 14999.000000 Clinton 19900.000000 Carnarvon 24500.000000 West Guilford 26900.000000 Old Brooklyn 29900.000000 White River 29900.000000 Casa Loma 35000.000000 Camden Township 39900.000000 Foymount 40000.000000 Marmora 40000.000000 Bloorcourt Village 45000.000000 Red Lake 46000.000000 St. Lawrence 48900.000000 Drummond 49000.000000 Douro 49900.000000 Martintown 49900.000000 Name: Price ($), dtype: float64
# Find the area with the highest average price
area_with_highest_price = average_prices.idxmax()
area_with_highest_price
'Bridle Path'
# find the highest average price
highest_average_price = average_prices.max()
highest_average_price
15183285.714285715
# Create a bar chart
top_20_areas = average_prices.sort_values(ascending=False).head(20)
plt=top_20_areas.plot(kind='bar', color='green',title="Top 20 Areas by Highest Average Price",ylabel="Average Price ($)")
# Create a bar chart
bottom_20_areas = average_prices.sort_values(ascending=True).head(20)
plt=bottom_20_areas.plot(kind='bar', color='red',title="bottom 20 Areas by Lowest Average Price",ylabel="Average Price ($)")
# Check available columns
housing.columns
Index(['Address', 'AreaName', 'Price ($)', 'lat', 'lng'], dtype='object')
from sklearn.model_selection import train_test_split
X = housing.drop(['Price ($)'],axis=1)
Y = housing['Price ($)']
from sklearn.linear_model import LinearRegression
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=42)
# Combine features and target variable for the training set
train_data = X_train.join(y_train)
train_data
| Address | AreaName | lat | lng | Price ($) | |
|---|---|---|---|---|---|
| 11223 | 32 CEDAR VALLEY DRIVE Kanata, ON | Kanata | 45.282478 | -75.850533 | 599000 |
| 9121 | 2306 Bois Vert Place Orleans, ON | Orléans | 45.458168 | -75.480278 | 219000 |
| 1711 | #1916 - 45 CARLTON ST Toronto, ON | Downtown | 43.661385 | -79.380608 | 599900 |
| 6761 | 993 Brookfield Road Ottawa, ON | Alta Vista | 45.374279 | -75.675613 | 535000 |
| 24355 | 6 Centre St Lansdowne, ON | Lansdowne | 44.404781 | -76.017311 | 189900 |
| ... | ... | ... | ... | ... | ... |
| 21928 | LOT 237 CAMBRIDGE Road W Crystal Beach, ON | Fort Erie | 42.864365 | -79.062050 | 49900 |
| 5495 | 250 Glenridge Drive Waterloo, ON | Waterloo | 43.482048 | -80.511574 | 349900 |
| 861 | #562 - 209 FORT YORK BLVD S Toronto, ON | Niagara | 43.636963 | -79.404671 | 349000 |
| 16073 | 85 OXFORD ST Richmond Hill, ON | Richmond Hill | 43.885166 | -79.448578 | 1649000 |
| 24099 | 22 Osprey Court Cambridge, ON | Fiddlesticks | 43.393806 | -80.285454 | 695000 |
19894 rows × 5 columns
train_data.hist()
array([[<Axes: title={'center': 'lat'}>, <Axes: title={'center': 'lng'}>],
[<Axes: title={'center': 'Price ($)'}>, <Axes: >]], dtype=object)
# Calculate the correlation matrix
correlation_matrix = train_data.corr()
correlation_matrix
/var/folders/yk/bhn8qs_94hb7nj2wx3cvz1y00000gn/T/ipykernel_89838/2707446889.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. correlation_matrix = train_data.corr()
| lat | lng | Price ($) | |
|---|---|---|---|
| lat | 1.000000 | 0.547325 | -0.048727 |
| lng | 0.547325 | 1.000000 | 0.013094 |
| Price ($) | -0.048727 | 0.013094 | 1.000000 |
import seaborn as sns
import matplotlib.pyplot as plt
# Create a heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="viridis")
# Show the plot
plt.show()
# Extract numerical features
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
# Extract categorical features
categorical_features = X_train.select_dtypes(include=['object']).columns
# Define the columns that need one-hot encoding (categorical variables)
categorical_columns = ['AreaName']
# Create a ColumnTransformer to apply one-hot encoding
preprocessor = ColumnTransformer(
transformers=[
('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
('numeric', 'passthrough', numerical_features)
],
remainder='drop'
)
# Apply the preprocessing steps to training set
X_train_encoded = preprocessor.fit_transform(X_train)
# Apply the same preprocessing to the testing set
X_test_encoded = preprocessor.transform(X_test)
# Initialize the Linear Regression model
model = LinearRegression()
# Train the model on the encoded training data
model.fit(X_train_encoded, y_train)
# Make predictions on the encoded testing set
predictions = model.predict(X_test_encoded)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
# Visualize predictions vs. actual values with area names
plt.scatter(y_test, predictions, alpha=0.5) # Using alpha for transparency
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs. Predicted Prices ')
plt.show()
Mean Squared Error: 468577098186.97034
import plotly.express as px
# Create a DataFrame with the necessary information
plot_data = pd.DataFrame({'Actual Prices': y_test, 'Predicted Prices': predictions, 'AreaName': X_test['AreaName']})
# Create an interactive scatter plot
fig = px.scatter(plot_data, x='Actual Prices', y='Predicted Prices', color='AreaName', hover_data=['AreaName'])
fig.update_layout(title='Actual vs. Predicted Prices with Area Names')
fig.show()
############ Top Areas #######################
# Get the top areas with highest predicted prices
top_predicted_areas = pd.DataFrame({'AreaName': X_test['AreaName'], 'Actual Price': y_test, 'Predicted Price': predictions})
top_predicted_areas = top_predicted_areas.sort_values(by='Predicted Price', ascending=False).head(10)
# Display the top predicted areas
print("Top Predicted Areas:")
print(top_predicted_areas)
Top Predicted Areas:
AreaName Actual Price Predicted Price
1611 Armour Heights 1590000 4.950576e+06
21830 York Mills 3980000 4.726823e+06
6499 York Mills 443888 4.726783e+06
16975 York Mills 398888 4.726783e+06
21103 York Mills 5999000 4.726777e+06
20786 York Mills 3300000 4.726767e+06
17793 York Mills 8800000 4.726767e+06
20868 York Mills 3195000 4.726748e+06
21148 Edenbridge - Humber Valley 7398000 3.410696e+06
10373 Edenbridge - Humber Valley 2495000 3.410695e+06
####################### Visualization ###################
# Visualize predicted prices for top areas
plt.figure(figsize=(12, 6))
sns.barplot(x='AreaName', y='Predicted Price', data=top_predicted_areas)
plt.title('Top Predicted Areas for Investment')
plt.xlabel('Area Name')
plt.ylabel('Predicted Price')
plt.xticks(rotation=45, ha='right')
plt.show()
################ Risk Assesment ##########
import pandas as pd
# Assuming 'y_test' is the actual prices and 'predictions' is the predicted prices
comparison_df = pd.DataFrame({'AreaName': X_test['AreaName'], 'Actual Price': y_test, 'Predicted Price': predictions})
# Calculate the standard deviation of predicted prices for each area
area_std_dev = comparison_df.groupby('AreaName')['Predicted Price'].std()
area_std_dev
AreaName
Aberfoyle NaN
Acton NaN
Agassiz NaN
Agincourt 65.731072
Agincourt North NaN
...
York NaN
York Mills 23.199804
York University Heights 54.837422
Yorkdale 13.570839
Zephyr 6.801444
Name: Predicted Price, Length: 765, dtype: float64
# Visualize variability in predictions(Bar chart )
plt.figure(figsize=(15, 7))
area_std_dev.sort_values().plot(kind='bar', color='purple')
plt.title('Variability in Predicted Prices for Different Areas')
plt.xlabel('Area Name')
plt.ylabel('Standard Deviation of Predicted Price')
plt.xticks(rotation=30, ha='right')
plt.show()
## Visualize variability in predictions(Scatter plot)
plt.figure(figsize=(15, 7))
plt.scatter(area_std_dev.index, area_std_dev.values, color='purple')
plt.title('Variability in Predicted Prices for Different Areas')
plt.xlabel('Area Name')
plt.ylabel('Standard Deviation of Predicted Price')
plt.xticks(rotation=30, ha='right')
plt.show()
# Generate a recommendation report
recommendation_report = """
Investment Recommendation Report
Top Predicted Areas for Investment:
{}
Analysis Summary:
- The top areas with the highest predicted prices present attractive investment opportunities.
- Variability in predictions should be considered for risk assessment.
- note that a higher standard deviation may be considered as an indicator of higher risk,
as it suggests that the predicted prices for that area are less consistent or more volatile.
""".format(top_predicted_areas.to_string(index=False))
print(recommendation_report)
Investment Recommendation Report
Top Predicted Areas for Investment:
AreaName Actual Price Predicted Price
Armour Heights 1590000 4.950576e+06
York Mills 3980000 4.726823e+06
York Mills 443888 4.726783e+06
York Mills 398888 4.726783e+06
York Mills 5999000 4.726777e+06
York Mills 3300000 4.726767e+06
York Mills 8800000 4.726767e+06
York Mills 3195000 4.726748e+06
Edenbridge - Humber Valley 7398000 3.410696e+06
Edenbridge - Humber Valley 2495000 3.410695e+06
Analysis Summary:
- The top areas with the highest predicted prices present attractive investment opportunities.
- Variability in predictions should be considered for risk assessment.
- note that a higher standard deviation may be considered as an indicator of higher risk,
as it suggests that the predicted prices for that area are less consistent or more volatile.